# coding=utf-8

import xml.etree.ElementTree as ET
import html


from red import *

from worker_manage import worker, dataparser

from fetcher import *
from datadefine import *

def de_html_char(text):
    '''Strip HTML tags and entities from *text* and normalise whitespace.

    Returns '' when text is None, so callers can feed it the result of
    Element.findtext() directly.
    '''
    if text is None:
        return ''

    # Remove markup tags (non-greedy so text between tags survives).
    text = red.sub(r'''<.*?>''', r'', text)

    # Decode HTML entities (&amp; -> &, &#x27; -> ', ...).
    text = html.unescape(text)

    # Replace the ideographic space U+3000 (0xe38080 in UTF-8) with a
    # plain ASCII space.
    text = text.replace('\u3000', ' ')

    # Collapse whitespace: newlines become spaces, long runs shrink to
    # two spaces, and the edges are trimmed.
    text = text.replace('\n', ' ')
    text = red.sub(r'\s{3,}', r'  ', text)
    text = text.strip()

    return text


# Element paths for each supported feed flavour, indexed by the feedtype
# value computed in parse_xml():
#   [0] = RSS 1.0, [1] = RSS 2.0, [2] = Atom
# 'f_*' keys are looked up on the document root; the rest on each item/entry.
tagnames = {
        'f_author': ('channel/title', 'channel/title', 'title'),
        'f_link'  : ('channel/link', 'channel/link', 'link'),
        'f_items' : ('channel/item', 'channel/item', 'entry'),
        
        'title'   : ('title', 'title', 'title'),
        'url'     : ('link', 'link', 'link'),
        
        'author'  : ('author', 'author', 'author'),
        'summary' : ('description', 'description', 'summary'),
        'pub_date': ('dc:date', 'pubDate', 'updated'),
        
        'guid'    : ('guid', 'guid', 'id')
        }

def parse_xml(data_dict, xml):
    '''Parse an RSS 1.0 / RSS 2.0 / Atom document into a list of c_info.

    data_dict must contain 'url' (only used in error reports).
    Raises c_worker_exception on empty input, malformed XML, or an
    unrecognised feed format.
    '''
    if not xml:
        raise c_worker_exception('XML为空字符串')

    # Strip the xmlns attributes from the Atom root element so that
    # ElementTree lookups below can use plain, unqualified tag names.
    # Raw string: '\s' in a non-raw literal is an invalid escape.
    xml = red.sub(r'<feed\s+.*?>',
                  '<feed>',
                  xml,
                  count=1,
                  flags=red.IGNORECASE)

    try:
        doc = ET.fromstring(xml)
    except Exception as e:
        raise c_worker_exception('解析XML失败',
                                 data_dict['url'],
                                 str(e)
                                 )

    # Detect the feed flavour; feedtype indexes the tuples in tagnames.
    if doc.tag == 'rss' and doc.get('version', '') == '1.0':
        feedtype = 0
    elif doc.tag == 'rss' and doc.get('version', '') == '2.0':
        feedtype = 1
    elif doc.tag == 'feed':
        feedtype = 2
    else:
        raise c_worker_exception('无法识别XML', data_dict['url'], '')

    # Feed-level title, used as the per-item author fallback below.
    f_author = de_html_char(doc.findtext(tagnames['f_author'][feedtype]))
    f_link = de_html_char(doc.findtext(tagnames['f_link'][feedtype]))

    ret = []
    for item in doc.findall(tagnames['f_items'][feedtype]):

        title = de_html_char(item.findtext(tagnames['title'][feedtype]))
        if feedtype < 2:
            url = de_html_char(item.findtext(tagnames['url'][feedtype]))
        else:
            # Atom: take the first <link> whose rel is 'alternate'.
            # RFC 4287 section 4.2.7.2: a link with no rel attribute
            # defaults to 'alternate', so a missing attribute matches too.
            url = ''
            for tag_link in item.findall('link'):
                if tag_link.get('rel', 'alternate') == 'alternate':
                    # Keep the first match; guard against a missing href.
                    url = url or tag_link.get('href', '')

        # Fall back to the feed title when the item names no author.
        author = de_html_char(item.findtext(tagnames['author'][feedtype],
                                            f_author))
        summary = de_html_char(item.findtext(tagnames['summary'][feedtype]))
        pubdate = de_html_char(item.findtext(tagnames['pub_date'][feedtype]))

        # guid is kept verbatim (no de-escaping) — it is an identifier,
        # not display text.
        guid = item.findtext(tagnames['guid'][feedtype])

        one = c_info()

        one.title = title
        one.url = url

        one.author = author
        one.summary = summary
        one.pub_date = pubdate

        # Stable unique id: prefer the feed's guid/id, else the URL.
        one.suid = guid if guid else url

        ret.append(one)

    return ret

# Worker entry point: fetch the feed document, then parse it.
@worker('rss_atom')
def download_process(data_dict, worker_dict):
    '''Download the feed at data_dict['url'] and return its parsed items.'''
    feed_url = data_dict['url']
    charset = data_dict.get('encoding', '')

    fetcher = Fetcher()
    raw_xml = fetcher.fetch_html(feed_url, charset)

    return parse_xml(data_dict, raw_xml)


@dataparser('rss_atom')
def rss_atom_parser(xml_string):
    '''Extract the feed URL and optional encoding from a <data><url> config element.'''
    root = ET.fromstring(xml_string)
    url_element = root.find('data').find('url')

    return {
        'url': url_element.text.strip(),
        'encoding': url_element.attrib.get('encoding', '').strip(),
    }
